Data source: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques

Prepare the workspace

# Session-wide printing / diagnostic settings used by the rest of the analysis.
options(
  digits = 2,   # print numbers with 2 significant digits
  scipen = 999, # strongly prefer fixed over scientific notation when printing
  warn = -1     # NOTE(review): silences every warning for the whole session;
                # kept for now so the rendered output stays the same
)
# The workspace is deliberately not cleared with rm(list = ls()): that call
# deletes whatever the person running the script already has in their session
# and does not reset packages or options. Run the analysis in a fresh R
# session instead.
library(magrittr)

1. Load data description, create data frame with levels (Optional)

Useful when there are a lot of variables and you don't want to go in and name the levels manually.

# Read the data dictionary as a single string, then break it on "\n" so that
# every line of the file becomes one element of a character vector.
v_data_description <-
  "/Users/thienpham/Data Mining/data/house-prices-advanced-regression-techniques/data_description.txt" %>%
  readr::read_file() %>%
  stringr::str_split(pattern = "\n") %>%
  unlist()


# Column-name lines in the description contain a ":", so remember the indices
# of every line that has one.
v_colnames <- which(grepl(pattern = ":", x = v_data_description))

# One row per description line: `levels` holds the raw text and `column` is a
# placeholder that will receive the name of the column the line belongs to.
M_data_description <- data.frame(column = NA, levels = v_data_description)


# Walk the description from top to bottom and stamp every row with the most
# recent header line (a line containing ": " followed by a non-digit).
# `column` starts as NA so rows that appear before the first header are marked
# as missing instead of failing on an undefined variable (or silently reusing
# a stale `column` left over in the session). seq_len() also makes the loop a
# no-op when the table has zero rows.
column <- NA_character_
for (j in seq_len(nrow(M_data_description))) {
  if (grepl(pattern = ": \\D", x = M_data_description$levels[j])) {
    column <- M_data_description$levels[j]
  }
  M_data_description$column[j] <- column
}


# Keep only the level code on each line: drop everything from the first tab
# onward ("\t.*"), then strip surrounding whitespace.
M_data_description$levels <- M_data_description$levels %>%
  gsub(pattern = "\t.*", replacement = "") %>%
  trimws()

# Discard header lines (they still contain a ":") and blank lines, leaving one
# row per (column, level) pair.
is_header <- grepl(pattern = ":", x = M_data_description$levels)
is_blank <- M_data_description$levels == ""
M_data_description <- M_data_description[!(is_header | is_blank), ]

# Reduce the stored header text to the bare column name by deleting the colon
# and everything after it.
M_data_description$column <- M_data_description$column %>%
  gsub(pattern = ":.*", replacement = "") %>%
  trimws() # %>%janitor::make_clean_names()
M_data_description

2. Load the data (train and test), row bind the data

Row bind depending on data and specific need. Needed for this case.

library(magrittr)
# Read every column as character so values such as "00934" keep their leading
# zeros. A default column spec is used instead of a fixed-length string of
# "c"s: the train and test files do not have the same number of columns (test
# has no SalePrice), so no single hard-coded length can match both.
col_types <- readr::cols(.default = readr::col_character())
# Training set: read as a plain data.frame and tag every row as "Train" so the
# rows can be told apart after they are stacked with the test set.
M_train <- dplyr::mutate(
  as.data.frame(
    readr::read_csv(
      file = "/Users/thienpham/Data Mining/data/house-prices-advanced-regression-techniques/train.csv",
      col_types = col_types,
      name_repair = "minimal"
    )
  ),
  TrainTest = "Train"
)


# Test set: read as a plain data.frame and add the two columns the training
# frame has so that both frames line up for row binding.
# SalePrice is filled with NA_character_ (not logical NA) so its type matches
# the character SalePrice column read from train.csv and the bind does not
# depend on implicit NA type promotion.
M_test <- readr::read_csv(
  file = "/Users/thienpham/Data Mining/data/house-prices-advanced-regression-techniques/test.csv",
  col_types = col_types,
  name_repair = "minimal"
) %>%
  as.data.frame() %>%
  dplyr::mutate(SalePrice = NA_character_, TrainTest = "Test")


# Stack train on top of test.
M <- dplyr::bind_rows(M_train, M_test)
# The target was read as character, so turn it back into a number.
M <- dplyr::mutate(M, SalePrice = as.numeric(SalePrice))
# PoolArea is essentially constant, so it carries no information: drop it.
M <- dplyr::select(M, -PoolArea)
M <- as.data.frame(M)

3. Fill-in missing values for non-numeric columns

Remember that this creates a new level called “N/A”.

# Primary class of every column. vapply() guarantees a character vector even
# if some column carries more than one class, where sapply() would return a
# list.
v_class <- vapply(
  X = M,
  FUN = function(x) class(x)[1],
  FUN.VALUE = character(1)
)
# In every character column, recode both real missing values and the literal
# string "NA" to the explicit level "N/A".
for (j in colnames(M)) {
  if (is.character(M[[j]])) {
    is_missing <- is.na(M[[j]]) | M[[j]] == "NA"
    M[[j]][is_missing] <- "N/A"
  }
}

4. Use data description to set categorical levels (optional)

Sometimes the actual data contain levels that aren't in the data description, including the “N/A” level we made.

# Turn every column documented in the data description into a factor.
# Level order: first the documented levels that actually occur in the data
# (in documentation order), then any observed values the documentation does
# not list (in order of first appearance).
for (j in unique(M_data_description$column)) {
  v_described <- M_data_description$levels[M_data_description$column == j]
  v_observed <- M[[j]]
  v_levels <- c(
    v_described[v_described %in% v_observed],
    unique(v_observed[!(v_observed %in% v_described)])
  )
  M[[j]] <- factor(x = v_observed, levels = v_levels)
}

5. Convert numeric columns back to numbers

# Everything was read as text, so convert the target and the numeric
# predictors back to numbers.
M$SalePrice <- as.numeric(M$SalePrice)

# Names of the numeric predictor columns.
v_numeric <- c(
  "LotFrontage", "LotArea", "YearBuilt", "YearRemodAdd", "MasVnrArea",
  "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF",
  "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath",
  "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd",
  "Fireplaces", "GarageYrBlt", "GarageCars", "GarageArea", "WoodDeckSF",
  "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "MiscVal",
  "MoSold", "YrSold"
)
# Convert all of them in one pass; values that are not numbers become NA.
M[v_numeric] <- lapply(M[v_numeric], as.numeric)

6. Visualize the target variable

We visualize the target variable to see the shape of its distribution.

library(ggplot2)
# Bin the target variable into 3 groups with (approximately) equal numbers of
# observations; the bins are only used to colour later plots.
# `right = FALSE` asks for left-closed intervals [a, b). The previous
# `closed = "left"` is an argument of cut_width() only: cut_number() forwards
# its extra arguments to base cut(), which does not know `closed` and silently
# ignores it, so the intervals were right-closed despite the stated intent.
M$Bin <- cut_number(
  x = M$SalePrice,
  n = 3,
  right = FALSE
)

# Histogram of the raw sale price, coloured by tercile bin. Only rows with a
# known SalePrice are plotted; the remaining rows have no target value.
ggplot(
  data = dplyr::filter(M, !is.na(SalePrice)),
  mapping = aes(x = SalePrice, fill = Bin)
) +
  geom_histogram() +
  theme_bw() +
  labs(
    title = "Histogram of sale prices",
    subtitle = "Notice that the tail on the right.\nEach color has 33% of data.",
    caption = "Data source: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Notice that our data is right-skewed, so we apply a Box-Cox transformation to make the distribution more symmetric.

# Estimate the Box-Cox transform from the SalePrice column alone.
preProcess_SalePrice <- caret::preProcess(
  x = M["SalePrice"],
  method = "BoxCox"
)

# Apply the fitted transform and keep the result in a NEW column, so the
# original SalePrice stays available unchanged.
M$BoxCox_SalePrice <- predict(preProcess_SalePrice, newdata = M)$SalePrice


# Histogram of the Box-Cox transformed sale price, coloured by the same
# tercile bins as before. Rows without a SalePrice are left out.
ggplot(
  data = dplyr::filter(M, !is.na(SalePrice)),
  mapping = aes(x = BoxCox_SalePrice, fill = Bin)
) +
  geom_histogram() +
  theme_bw() +
  labs(
    title = "Histogram of Box-Cox transformed sale prices",
    subtitle = "Notice that the histogram is closer to symmetric.\nEach color has 33% of data.",
    caption = "Data source: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Notice the Boxcox transform creates a new variable not actually changing our target variable.

7. Visualize target variable by categorical predictors

Use Wilcoxon rank-sum test p-values (the mean of the pairwise p-values between the levels of a predictor) to order the categorical predictor variables by impact, the most impactful first.

* If the boxes are far apart from each other vertically, it implies a good predictor variable.
* If the boxes are at similar heights, the predictor does not do a good job of informing us about the target variable.
* The boxes are also color-filled based on their median value.

# For every categorical column in the data description, run pairwise Wilcoxon
# tests of SalePrice between its levels and average the resulting p-values.
# Only rows with a known SalePrice are used; the mask is computed once rather
# than for every column.
# vapply() pins the result to one number per column (sapply() can silently
# change its return type).
is_train <- !is.na(M$SalePrice)
v_wilcox.test <- vapply(
  X = unique(
    x = M_data_description$column
  ),
  FUN = function(j) {
    test_j <- pairwise.wilcox.test(
      x = M$SalePrice[is_train],
      g = M[is_train, j]
    )
    mean(as.vector(test_j$p.value), na.rm = TRUE)
  },
  FUN.VALUE = numeric(1)
)
# Sort the mean p-values in ascending order (smallest first). Note that
# sort() drops NA/NaN entries by default, so a column whose mean p-value
# could not be computed disappears from the vector.
v_wilcox.test <- sort(
  x = v_wilcox.test
)


library(ggplot2)
# One box plot of SalePrice per categorical predictor, in the order given by
# v_wilcox.test. Each box is filled by the median SalePrice of its level.
# group_by_() and aes_string() are deprecated; the column whose name is held
# in `j` is selected with across(all_of(j)) in dplyr and with the `.data`
# pronoun in ggplot2 instead.
for (j in names(v_wilcox.test)) {
  M_plot <- M[, c("SalePrice", j)] %>%
    dplyr::group_by(dplyr::across(dplyr::all_of(j))) %>%
    dplyr::mutate(median_SalePrice = median(SalePrice, na.rm = TRUE)) %>%
    dplyr::ungroup()

  p <- ggplot(M_plot) +
    aes(x = .data[[j]], y = SalePrice, fill = median_SalePrice) +
    geom_boxplot(color = "grey50") +
    theme_bw() +
    labs(
      x = j, # keep the column name as the axis title
      title = paste0("Box plots of sale price by ", j),
      subtitle = "Boxes with very different vertical positions give better predictors",
      caption = "Data source: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques"
    )
  plot(p)
}

8. Cap extreme numeric values

Prevents outliers from pulling model away from the rest of the data.

# Give the numeric columns syntactic snake_case names, in the data frame and
# in the lookup vector, so the two stay in sync.
is_numeric_col <- colnames(M) %in% v_numeric
colnames(M)[is_numeric_col] <- janitor::make_clean_names(
  colnames(M)[is_numeric_col]
)
v_numeric <- janitor::make_clean_names(v_numeric)

# Clamp every numeric predictor to its own 1st-99th percentile range: values
# below the 1st percentile are raised to it, values above the 99th percentile
# are lowered to it. Missing values are left missing.
# Example: in 1, 14, 15, 16, 15, 16, 100 with cut-offs 14 and 16, the 1
# becomes 14 and the 100 becomes 16.
for (j in v_numeric) {
  v_cutoffs <- quantile(M[[j]], probs = c(0.01, 0.99), na.rm = TRUE)
  lower <- v_cutoffs[[1]]
  upper <- v_cutoffs[[2]]
  M[[j]] <- pmin(pmax(M[[j]], lower), upper)
}

9. Apply Box-Cox, Yeo-Johnson, exponential transforms to numeric predictors

During data preparation exploration

# Numeric predictors that receive a Box-Cox transform.
v_BoxCox <- c(
  "lot_frontage", "lot_area", "year_built", "year_remod_add", "x1st_flr_sf",
  "gr_liv_area", "tot_rms_abv_grd", "garage_yr_blt", "mo_sold"
)
# Numeric predictors that receive a Yeo-Johnson transform.
v_YeoJohnson <- c(
  "mas_vnr_area", "bsmt_fin_sf1", "bsmt_fin_sf2", "bsmt_unf_sf",
  "total_bsmt_sf", "x2nd_flr_sf", "wood_deck_sf", "open_porch_sf",
  "enclosed_porch"
)
# Numeric predictors that are left untransformed.
# "pool_area" is no longer listed: PoolArea is dropped from M when the data
# are loaded, so indexing M with it would fail. The repeated entries
# ("pool_area" and "bsmt_half_bath" each appeared twice) are removed as well,
# so the three vectors now partition v_numeric exactly.
v_notransform <- c(
  "low_qual_fin_sf", "bsmt_half_bath", "x3ssn_porch", "misc_val",
  "garage_cars", "garage_area", "full_bath", "fireplaces", "half_bath",
  "bedroom_abv_gr", "bsmt_full_bath", "kitchen_abv_gr", "screen_porch",
  "yr_sold"
)
# Box-Cox group: estimate the transform on its columns, then overwrite those
# columns with the transformed values.
preProcess_BoxCox <- caret::preProcess(x = M[, v_BoxCox], method = "BoxCox")
M[, v_BoxCox] <- predict(preProcess_BoxCox, newdata = M[, v_BoxCox])

# Yeo-Johnson group: same two steps. The two groups share no columns, so the
# order in which they are processed does not matter.
preProcess_YeoJohnson <- caret::preProcess(
  x = M[, v_YeoJohnson],
  method = "YeoJohnson"
)
M[, v_YeoJohnson] <- predict(
  preProcess_YeoJohnson,
  newdata = M[, v_YeoJohnson]
)

10. Visualize the target variable by numeric predictors

# Spearman correlation between every numeric predictor and the Box-Cox
# transformed target, using only rows where both values are present.
# vapply() guarantees exactly one number per predictor (sapply() can change
# its return type depending on the input).
v_cor <- vapply(
  X = v_numeric,
  FUN = function(j) {
    cor(
      x = M[, j],
      y = M$BoxCox_SalePrice,
      method = "spearman",
      use = "pairwise.complete.obs"
    )
  },
  FUN.VALUE = numeric(1)
)

# Order the correlations from largest absolute value to smallest.
v_cor <- v_cor[order(abs(v_cor), decreasing = TRUE)]


# One scatter plot of the transformed target per numeric predictor, strongest
# Spearman correlation first. Points are coloured by the SalePrice bin and a
# loess curve is overlaid (a straight regression line was too rigid).
# aes_string() is deprecated; the column named by `j` is mapped through the
# `.data` pronoun instead.
for (j in names(v_cor)) {
  M_plot <- M[, c("BoxCox_SalePrice", "Bin", j)]
  p <- ggplot(M_plot[complete.cases(M_plot), ]) +
    aes(x = .data[[j]], y = BoxCox_SalePrice) +
    geom_point(aes(color = Bin)) +
    geom_smooth(color = "black", se = FALSE, method = "loess") +
    theme_bw() +
    labs(
      x = j, # keep the column name as the axis title
      title = paste0("Scatter plot of sale price by ", j),
      subtitle = paste0("Spearman correlation: ", round(v_cor[j], 2), "\nEach color has 33% of the points."),
      caption = "Data source: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques"
    )
  plot(p)
}
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

11. Fill-in missing values for numeric predictor variables

Normally the preProcess function from the caret package would be preferred, because we would not have to refit it every time we get new data. The missRanger package is used here instead because we are working with both the train and test sets, which are finalized.

# All predictors: the numeric columns followed by the categorical columns
# that were ranked by the Wilcoxon step.
v_predictors <- c(v_numeric, names(v_wilcox.test))

# Impute missing predictor values with missRanger and write the completed
# columns back into M. The seed is fixed so the result can be reproduced;
# at most 100 iterations are allowed.
M[, v_predictors] <- missRanger::missRanger(
  M[, v_predictors],
  maxiter = 100,
  seed = 823,
  returnOOB = TRUE
)
## 
## Missing value imputation by random forests
## 
##   Variables to impute:       lot_frontage, mas_vnr_area, bsmt_fin_sf1, bsmt_fin_sf2, bsmt_unf_sf, total_bsmt_sf, bsmt_full_bath, bsmt_half_bath, garage_yr_blt, garage_cars, garage_area
##   Variables used to impute:  lot_frontage, lot_area, year_built, year_remod_add, mas_vnr_area, bsmt_fin_sf1, bsmt_fin_sf2, bsmt_unf_sf, total_bsmt_sf, x1st_flr_sf, x2nd_flr_sf, low_qual_fin_sf, gr_liv_area, bsmt_full_bath, bsmt_half_bath, full_bath, half_bath, bedroom_abv_gr, kitchen_abv_gr, tot_rms_abv_grd, fireplaces, garage_yr_blt, garage_cars, garage_area, wood_deck_sf, open_porch_sf, enclosed_porch, x3ssn_porch, screen_porch, misc_val, mo_sold, yr_sold, CentralAir, GarageFinish, KitchenQual, ExterQual, PavedDrive, BsmtExposure, BsmtQual, MSZoning, FireplaceQu, BsmtCond, OverallQual, LandContour, Street, MasVnrType, GarageType, HeatingQC, BsmtFinType1, Alley, Neighborhood, LotShape, HouseStyle, BldgType, Fence, Foundation, MSSubClass, SaleCondition, OverallCond, LandSlope, GarageQual, Utilities, ExterCond, GarageCond, Condition1, BsmtFinType2, LotConfig, Functional, Heating, Exterior1st, Electrical, Exterior2nd, SaleType, MiscFeature, PoolQC, RoofStyle, RoofMatl, Condition2
## 
## iter 1
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |======================================================================| 100%
## iter 2
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |======================================================================| 100%
## iter 3
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |======================================================================| 100%

The imputation took 3 iterations, which is fast; if it had taken 100 iterations, that would be a sign that the imputation model is bad.

12. Run association analysis

During feature selection we will use our association analysis to identify which predictors to keep and which to drop.

For numeric predictors we will use Spearman’s correlation since it is better at detecting non-linear monotonic relationships.

For categorical predictors we will use Cramer’s V in place of correlation.

# Pairwise Spearman correlations between the numeric predictors.
M_cor <- cor(M[, v_numeric], method = "spearman", use = "pairwise.complete.obs")

# Pairwise Cramer's V between the categorical predictors.
M_CramerV <- DescTools::PairApply(
  M[, names(v_wilcox.test)],
  FUN = DescTools::CramerV
)
# Correlation plot for the numeric predictors.
# Clustered ordering is switched off here because there was a problem with the
# pool column for this data set. It should be turned back on when possible:
# without it the plot only shows pairwise association, whereas the hclust
# ordering would also reveal which groups of variables carry the same
# information (useful when removing features).
# Disabled arguments, kept for reference:
#   order = "hclust",
#   hclust.method = "ward.D"
corrplot::corrplot(M_cor, diag = FALSE, is.corr = FALSE)

# Cramer's V plot for the categorical predictors, ordered by hierarchical
# clustering (Ward's method).
corrplot::corrplot(
  M_CramerV,
  diag = FALSE,
  is.corr = FALSE,
  order = "hclust",
  hclust.method = "ward.D"
)

Notice the squares in the correlation matrix, this means if we remove one variable, the other variables in that square could pick up its slack.

13. Perform variable clustering

Variable clustering on the numeric variables

Remember: Ward's D gives more evenly sized groups, which helps decide which variables to remove during feature selection, while single linkage does a good job of identifying outliers, telling us which variables we shouldn't remove.

# Dendrogram of the numeric predictors: 1 - Spearman correlation is used as
# the dissimilarity and the variables are clustered with Ward's method.
# NOTE(review): plot.hclust() appears to build its x-axis label from the `d`
# expression of the hclust() call, so keep that expression inline - confirm
# before refactoring.
plot(
  x = hclust(
    d = as.dist(
      m = 1 - M_cor
      ),
    method = "ward.D" 
  )
)

If we get rid of lot_frontage, lot_area could cover for it.

# Same dissimilarity (1 - Spearman correlation) for the numeric predictors,
# clustered with single linkage this time.
# NOTE(review): plot.hclust() appears to build its x-axis label from the `d`
# expression of the hclust() call, so keep that expression inline - confirm
# before refactoring.
plot(
  x = hclust(
    d = as.dist(
      m = 1 - M_cor
    ),
    method = "single"
  )
)

Year sold is very different from all of the other predictor variables, so we shouldn't throw it out.

14. Variable clustering on the categorical variables

# Dendrogram of the categorical predictors: 1 - Cramer's V is used as the
# dissimilarity and the variables are clustered with Ward's method.
# NOTE(review): plot.hclust() appears to build its x-axis label from the `d`
# expression of the hclust() call, so keep that expression inline - confirm
# before refactoring.
plot(
  x = hclust(
    d = as.dist(
      m = 1 - M_CramerV
      ),
    method = "ward.D"
  )
)

External quality and overall quality contain the same information, we could remove one or the other.

# Same dissimilarity (1 - Cramer's V) for the categorical predictors,
# clustered with single linkage this time.
# NOTE(review): plot.hclust() appears to build its x-axis label from the `d`
# expression of the hclust() call, so keep that expression inline - confirm
# before refactoring.
plot(
  x = hclust(
    d = as.dist(
      m = 1 - M_CramerV
    ),
    method = "single"
  )
)

We can see that fence is distinct from all other predictor variables, so we should not remove it because it contains unique information.

Prepare the categorical and ordinal columns

To make model fitting faster and to prevent over-fitting, categorical and ordinal columns will be binned into two levels and represented with a binary column. The binning strategy aims for 50/50 bins. (If this is unclear, see this portion of the categorical data prep notes 02.)

Bin nominal levels into binary

two strategies

group low-frequency levels into “Other”

# Columns that are dominated by a single level.
v_other <- c(
  "CentralAir", "PavedDrive", "BsmtExposure", "MSZoning", "BsmtCond",
  "LandContour", "Street", "Alley", "BldgType", "Fence", "SaleCondition",
  "LandSlope", "GarageQual", "Utilities", "ExterCond", "GarageCond",
  "Condition1", "BsmtFinType2", "LotConfig", "Functional", "Heating",
  "Electrical", "SaleType", "MiscFeature", "PoolQC", "RoofStyle", "RoofMatl",
  "Condition2"
)

# Replace each of these columns with a 0/1 indicator of "is the most frequent
# level", and print the level frequencies that the decision was based on.
for (j in v_other) {
  # Level counts, most frequent first.
  v_table <- sort(table(x = M[, j]), decreasing = TRUE)
  top_level <- names(v_table)[1]

  # 1 when the row has the most frequent level, 0 otherwise.
  M[, j] <- as.numeric(M[, j] == top_level)

  # Frequency report for this column (the counts were taken before recoding).
  v_table <- unclass(v_table)
  M_class <- data.frame(
    level = names(v_table),
    n = v_table,
    proportion = prop.table(v_table)
  )
  print("---------------------------------------------------------------------")
  print(j)
  print(M_class)
}
## [1] "---------------------------------------------------------------------"
## [1] "CentralAir"
##   level    n proportion
## Y     Y 2723      0.933
## N     N  196      0.067
## [1] "---------------------------------------------------------------------"
## [1] "PavedDrive"
##   level    n proportion
## Y     Y 2641      0.905
## N     N  216      0.074
## P     P   62      0.021
## [1] "---------------------------------------------------------------------"
## [1] "BsmtExposure"
##     level    n proportion
## No     No 1904      0.652
## Av     Av  418      0.143
## Gd     Gd  276      0.095
## Mn     Mn  239      0.082
## N/A   N/A   82      0.028
## [1] "---------------------------------------------------------------------"
## [1] "MSZoning"
##           level    n proportion
## RL           RL 2265     0.7760
## RM           RM  460     0.1576
## FV           FV  139     0.0476
## RH           RH   26     0.0089
## C (all) C (all)   25     0.0086
## N/A         N/A    4     0.0014
## [1] "---------------------------------------------------------------------"
## [1] "BsmtCond"
##     level    n proportion
## TA     TA 2606     0.8928
## Gd     Gd  122     0.0418
## Fa     Fa  104     0.0356
## N/A   N/A   82     0.0281
## Po     Po    5     0.0017
## [1] "---------------------------------------------------------------------"
## [1] "LandContour"
##     level    n proportion
## Lvl   Lvl 2622      0.898
## HLS   HLS  120      0.041
## Bnk   Bnk  117      0.040
## Low   Low   60      0.021
## [1] "---------------------------------------------------------------------"
## [1] "Street"
##      level    n proportion
## Pave  Pave 2907     0.9959
## Grvl  Grvl   12     0.0041
## [1] "---------------------------------------------------------------------"
## [1] "Alley"
##      level    n proportion
## N/A    N/A 2721      0.932
## Grvl  Grvl  120      0.041
## Pave  Pave   78      0.027
## [1] "---------------------------------------------------------------------"
## [1] "BldgType"
##         level    n proportion
## 1Fam     1Fam 2425      0.831
## TwnhsE TwnhsE  227      0.078
## Duplex Duplex  109      0.037
## Twnhs   Twnhs   96      0.033
## 2fmCon 2fmCon   62      0.021
## [1] "---------------------------------------------------------------------"
## [1] "Fence"
##       level    n proportion
## N/A     N/A 2348     0.8044
## MnPrv MnPrv  329     0.1127
## GdPrv GdPrv  118     0.0404
## GdWo   GdWo  112     0.0384
## MnWw   MnWw   12     0.0041
## [1] "---------------------------------------------------------------------"
## [1] "SaleCondition"
##           level    n proportion
## Normal   Normal 2402     0.8229
## Partial Partial  245     0.0839
## Abnorml Abnorml  190     0.0651
## Family   Family   46     0.0158
## Alloca   Alloca   24     0.0082
## AdjLand AdjLand   12     0.0041
## [1] "---------------------------------------------------------------------"
## [1] "LandSlope"
##     level    n proportion
## Gtl   Gtl 2778     0.9517
## Mod   Mod  125     0.0428
## Sev   Sev   16     0.0055
## [1] "---------------------------------------------------------------------"
## [1] "GarageQual"
##     level    n proportion
## TA     TA 2604     0.8921
## N/A   N/A  159     0.0545
## Fa     Fa  124     0.0425
## Gd     Gd   24     0.0082
## Po     Po    5     0.0017
## Ex     Ex    3     0.0010
## [1] "---------------------------------------------------------------------"
## [1] "Utilities"
##         level    n proportion
## AllPub AllPub 2916    0.99897
## N/A       N/A    2    0.00069
## NoSeWa NoSeWa    1    0.00034
## [1] "---------------------------------------------------------------------"
## [1] "ExterCond"
##    level    n proportion
## TA    TA 2538     0.8695
## Gd    Gd  299     0.1024
## Fa    Fa   67     0.0230
## Ex    Ex   12     0.0041
## Po    Po    3     0.0010
## [1] "---------------------------------------------------------------------"
## [1] "GarageCond"
##     level    n proportion
## TA     TA 2654     0.9092
## N/A   N/A  159     0.0545
## Fa     Fa   74     0.0254
## Gd     Gd   15     0.0051
## Po     Po   14     0.0048
## Ex     Ex    3     0.0010
## [1] "---------------------------------------------------------------------"
## [1] "Condition1"
##         level    n proportion
## Norm     Norm 2511     0.8602
## Feedr   Feedr  164     0.0562
## Artery Artery   92     0.0315
## RRAn     RRAn   50     0.0171
## PosN     PosN   39     0.0134
## RRAe     RRAe   28     0.0096
## PosA     PosA   20     0.0069
## RRNn     RRNn    9     0.0031
## RRNe     RRNe    6     0.0021
## [1] "---------------------------------------------------------------------"
## [1] "BsmtFinType2"
##     level    n proportion
## Unf   Unf 2493      0.854
## Rec   Rec  105      0.036
## LwQ   LwQ   87      0.030
## N/A   N/A   80      0.027
## BLQ   BLQ   68      0.023
## ALQ   ALQ   52      0.018
## GLQ   GLQ   34      0.012
## [1] "---------------------------------------------------------------------"
## [1] "LotConfig"
##           level    n proportion
## Inside   Inside 2133     0.7307
## Corner   Corner  511     0.1751
## CulDSac CulDSac  176     0.0603
## FR2         FR2   85     0.0291
## FR3         FR3   14     0.0048
## [1] "---------------------------------------------------------------------"
## [1] "Functional"
##      level    n proportion
## Typ    Typ 2717    0.93080
## Min2  Min2   70    0.02398
## Min1  Min1   65    0.02227
## Mod    Mod   35    0.01199
## Maj1  Maj1   19    0.00651
## Maj2  Maj2    9    0.00308
## Sev    Sev    2    0.00069
## N/A    N/A    2    0.00069
## [1] "---------------------------------------------------------------------"
## [1] "Heating"
##       level    n proportion
## GasA   GasA 2874    0.98458
## GasW   GasW   27    0.00925
## Grav   Grav    9    0.00308
## Wall   Wall    6    0.00206
## OthW   OthW    2    0.00069
## Floor Floor    1    0.00034
## [1] "---------------------------------------------------------------------"
## [1] "Electrical"
##       level    n proportion
## SBrkr SBrkr 2671    0.91504
## FuseA FuseA  188    0.06441
## FuseF FuseF   50    0.01713
## FuseP FuseP    8    0.00274
## Mix     Mix    1    0.00034
## N/A     N/A    1    0.00034
## [1] "---------------------------------------------------------------------"
## [1] "SaleType"
##       level    n proportion
## WD       WD 2525    0.86502
## New     New  239    0.08188
## COD     COD   87    0.02980
## ConLD ConLD   26    0.00891
## CWD     CWD   12    0.00411
## ConLI ConLI    9    0.00308
## ConLw ConLw    8    0.00274
## Oth     Oth    7    0.00240
## Con     Con    5    0.00171
## N/A     N/A    1    0.00034
## [1] "---------------------------------------------------------------------"
## [1] "MiscFeature"
##      level    n proportion
## N/A    N/A 2814    0.96403
## Shed  Shed   95    0.03255
## Gar2  Gar2    5    0.00171
## Othr  Othr    4    0.00137
## TenC  TenC    1    0.00034
## [1] "---------------------------------------------------------------------"
## [1] "PoolQC"
##     level    n proportion
## N/A   N/A 2909    0.99657
## Ex     Ex    4    0.00137
## Gd     Gd    4    0.00137
## Fa     Fa    2    0.00069
## [1] "---------------------------------------------------------------------"
## [1] "RoofStyle"
##           level    n proportion
## Gable     Gable 2310     0.7914
## Hip         Hip  551     0.1888
## Gambrel Gambrel   22     0.0075
## Flat       Flat   20     0.0069
## Mansard Mansard   11     0.0038
## Shed       Shed    5     0.0017
## [1] "---------------------------------------------------------------------"
## [1] "RoofMatl"
##           level    n proportion
## CompShg CompShg 2876    0.98527
## Tar&Grv Tar&Grv   23    0.00788
## WdShake WdShake    9    0.00308
## WdShngl WdShngl    7    0.00240
## ClyTile ClyTile    1    0.00034
## Membran Membran    1    0.00034
## Metal     Metal    1    0.00034
## Roll       Roll    1    0.00034
## [1] "---------------------------------------------------------------------"
## [1] "Condition2"
##         level    n proportion
## Norm     Norm 2889    0.98972
## Feedr   Feedr   13    0.00445
## Artery Artery    5    0.00171
## PosN     PosN    4    0.00137
## PosA     PosA    4    0.00137
## RRNn     RRNn    2    0.00069
## RRAn     RRAn    1    0.00034
## RRAe     RRAe    1    0.00034

group by conditional mean

# Columns that are binned by the conditional mean of the transformed target.
v_mean <- c(
  "GarageFinish","KitchenQual","ExterQual","BsmtQual","FireplaceQu",
  "OverallQual","MasVnrType","GarageType","HeatingQC","BsmtFinType1",
  "Neighborhood","LotShape","HouseStyle","Foundation","MSSubClass",
  "OverallCond","Exterior1st","Exterior2nd"
)
# For each column: rank its levels by mean BoxCox_SalePrice, accumulate the
# share of rows from both ends of that ranking, and recode the column to 1 for
# the levels whose averaged cumulative share is at least 0.5 (0 otherwise).
#
# Changes from the previous version (the recoding result is the same):
# * select_(), mutate_() and group_by_() are deprecated; the column named by
#   `j` is now addressed with all_of() / across().
# * the mutate_(j = as.character(j)) step was dropped: it only created a
#   throw-away column literally called "j" that summarise() discarded.
# * v_j used to index factor levels with a row mask (misaligned) and was never
#   used; it now holds the selected level names and drives the recoding.
for (j in v_mean) {
  M_BoxCox_SalePrice <- M %>%
    dplyr::select(dplyr::all_of(c(j, "BoxCox_SalePrice"))) %>%
    dplyr::group_by(dplyr::across(dplyr::all_of(j))) %>%
    dplyr::summarise(
      BoxCox_SalePrice = mean(BoxCox_SalePrice, na.rm = TRUE),
      n = dplyr::n()
    ) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(proportion = n / sum(n)) %>%
    # cumulative share counted from the lowest-mean level upwards
    dplyr::arrange(BoxCox_SalePrice) %>%
    dplyr::mutate(cumsum_ascending = cumsum(proportion)) %>%
    # share of rows below each level, accumulated from the highest-mean level
    dplyr::arrange(dplyr::desc(cumsum_ascending)) %>%
    dplyr::mutate(cumsum_descending = 1 - cumsum(proportion)) %>%
    dplyr::arrange(cumsum_ascending) %>%
    dplyr::mutate(mean_cumsum = (cumsum_ascending + cumsum_descending) / 2) %>%
    dplyr::arrange(mean_cumsum) %>%
    as.data.frame()

  # Names of the levels that fall in the upper half.
  v_j <- as.character(
    M_BoxCox_SalePrice[M_BoxCox_SalePrice$mean_cumsum >= 0.5, j]
  )
  M[, j] <- as.numeric(
    x = as.character(M[, j]) %in% v_j
  )
  print(
    x = "---------------------------------------------------------------------"
  )
  print(
    x = j
  )
  print(knitr::kable(
    M_BoxCox_SalePrice
  ))
}
## [1] "---------------------------------------------------------------------"
## [1] "GarageFinish"
## 
## 
## |GarageFinish | BoxCox_SalePrice|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:------------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |N/A          |               11|  159|       0.05|             0.05|              0.00|        0.03|
## |Unf          |               12| 1230|       0.42|             0.48|              0.05|        0.27|
## |RFn          |               12|  811|       0.28|             0.75|              0.48|        0.61|
## |Fin          |               12|  719|       0.25|             1.00|              0.75|        0.88|
## [1] "---------------------------------------------------------------------"
## [1] "KitchenQual"
## 
## 
## |KitchenQual | BoxCox_SalePrice|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |Fa          |               12|   70|       0.02|             0.02|              0.00|        0.01|
## |TA          |               12| 1492|       0.51|             0.54|              0.02|        0.28|
## |Gd          |               12| 1151|       0.39|             0.93|              0.54|        0.73|
## |Ex          |               13|  205|       0.07|             1.00|              0.93|        0.96|
## |N/A         |              NaN|    1|       0.00|             1.00|              1.00|        1.00|
## [1] "---------------------------------------------------------------------"
## [1] "ExterQual"
## 
## 
## |ExterQual | BoxCox_SalePrice|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:---------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |Fa        |               11|   35|       0.01|             0.01|              0.00|        0.01|
## |TA        |               12| 1798|       0.62|             0.63|              0.01|        0.32|
## |Gd        |               12|  979|       0.34|             0.96|              0.63|        0.80|
## |Ex        |               13|  107|       0.04|             1.00|              0.96|        0.98|
## [1] "---------------------------------------------------------------------"
## [1] "BsmtQual"
## 
## 
## |BsmtQual | BoxCox_SalePrice|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:--------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |N/A      |               12|   81|       0.03|             0.03|              0.00|        0.01|
## |Fa       |               12|   88|       0.03|             0.06|              0.03|        0.04|
## |TA       |               12| 1283|       0.44|             0.50|              0.06|        0.28|
## |Gd       |               12| 1209|       0.41|             0.91|              0.50|        0.70|
## |Ex       |               13|  258|       0.09|             1.00|              0.91|        0.96|
## [1] "---------------------------------------------------------------------"
## [1] "FireplaceQu"
## 
## 
## |FireplaceQu | BoxCox_SalePrice|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |Po          |               12|   46|       0.02|             0.02|              0.00|        0.01|
## |N/A         |               12| 1420|       0.49|             0.50|              0.02|        0.26|
## |Fa          |               12|   74|       0.03|             0.53|              0.50|        0.51|
## |TA          |               12|  592|       0.20|             0.73|              0.53|        0.63|
## |Gd          |               12|  744|       0.25|             0.99|              0.73|        0.86|
## |Ex          |               13|   43|       0.01|             1.00|              0.99|        0.99|
## [1] "---------------------------------------------------------------------"
## [1] "OverallQual"
## 
## 
## |OverallQual | BoxCox_SalePrice|   n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-----------|----------------:|---:|----------:|----------------:|-----------------:|-----------:|
## |1           |               11|   4|       0.00|             0.00|              0.00|        0.00|
## |2           |               11|  13|       0.00|             0.01|              0.00|        0.00|
## |3           |               11|  40|       0.01|             0.02|              0.01|        0.01|
## |4           |               12| 226|       0.08|             0.10|              0.02|        0.06|
## |5           |               12| 825|       0.28|             0.38|              0.10|        0.24|
## |6           |               12| 731|       0.25|             0.63|              0.38|        0.50|
## |7           |               12| 600|       0.21|             0.84|              0.63|        0.73|
## |8           |               12| 342|       0.12|             0.95|              0.84|        0.89|
## |9           |               13| 107|       0.04|             0.99|              0.95|        0.97|
## |10          |               13|  31|       0.01|             1.00|              0.99|        0.99|
## [1] "---------------------------------------------------------------------"
## [1] "MasVnrType"
## 
## 
## |MasVnrType | BoxCox_SalePrice|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |BrkCmn     |               12|   25|       0.01|             0.01|              0.00|        0.00|
## |None       |               12| 1742|       0.60|             0.61|              0.01|        0.31|
## |BrkFace    |               12|  879|       0.30|             0.91|              0.61|        0.76|
## |N/A        |               12|   24|       0.01|             0.91|              0.91|        0.91|
## |Stone      |               12|  249|       0.09|             1.00|              0.91|        0.96|
## [1] "---------------------------------------------------------------------"
## [1] "GarageType"
## 
## 
## |GarageType | BoxCox_SalePrice|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |N/A        |               11|  157|       0.05|             0.05|              0.00|        0.03|
## |CarPort    |               12|   15|       0.01|             0.06|              0.05|        0.06|
## |Detchd     |               12|  779|       0.27|             0.33|              0.06|        0.19|
## |2Types     |               12|   23|       0.01|             0.33|              0.33|        0.33|
## |Basment    |               12|   36|       0.01|             0.35|              0.33|        0.34|
## |Attchd     |               12| 1723|       0.59|             0.94|              0.35|        0.64|
## |BuiltIn    |               12|  186|       0.06|             1.00|              0.94|        0.97|
## [1] "---------------------------------------------------------------------"
## [1] "HeatingQC"
## 
## 
## |HeatingQC | BoxCox_SalePrice|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:---------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |Po        |               11|    3|       0.00|             0.00|              0.00|        0.00|
## |Fa        |               12|   92|       0.03|             0.03|              0.00|        0.02|
## |TA        |               12|  857|       0.29|             0.33|              0.03|        0.18|
## |Gd        |               12|  474|       0.16|             0.49|              0.33|        0.41|
## |Ex        |               12| 1493|       0.51|             1.00|              0.49|        0.74|
## [1] "---------------------------------------------------------------------"
## [1] "BsmtFinType1"
## 
## 
## |BsmtFinType1 | BoxCox_SalePrice|   n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:------------|----------------:|---:|----------:|----------------:|-----------------:|-----------:|
## |N/A          |               12|  79|       0.03|             0.03|              0.00|        0.01|
## |Rec          |               12| 288|       0.10|             0.13|              0.03|        0.08|
## |BLQ          |               12| 269|       0.09|             0.22|              0.13|        0.17|
## |LwQ          |               12| 154|       0.05|             0.27|              0.22|        0.24|
## |ALQ          |               12| 429|       0.15|             0.42|              0.27|        0.34|
## |Unf          |               12| 851|       0.29|             0.71|              0.42|        0.56|
## |GLQ          |               12| 849|       0.29|             1.00|              0.71|        0.85|
## [1] "---------------------------------------------------------------------"
## [1] "Neighborhood"
## 
## 
## |Neighborhood | BoxCox_SalePrice|   n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:------------|----------------:|---:|----------:|----------------:|-----------------:|-----------:|
## |IDOTRR       |               11|  93|       0.03|             0.03|              0.00|        0.02|
## |MeadowV      |               11|  37|       0.01|             0.04|              0.03|        0.04|
## |BrDale       |               12|  30|       0.01|             0.05|              0.04|        0.05|
## |BrkSide      |               12| 108|       0.04|             0.09|              0.05|        0.07|
## |OldTown      |               12| 239|       0.08|             0.17|              0.09|        0.13|
## |Edwards      |               12| 194|       0.07|             0.24|              0.17|        0.21|
## |Sawyer       |               12| 151|       0.05|             0.29|              0.24|        0.27|
## |Blueste      |               12|  10|       0.00|             0.30|              0.29|        0.29|
## |SWISU        |               12|  48|       0.02|             0.31|              0.30|        0.30|
## |NPkVill      |               12|  23|       0.01|             0.32|              0.31|        0.32|
## |NAmes        |               12| 443|       0.15|             0.47|              0.32|        0.40|
## |Mitchel      |               12| 114|       0.04|             0.51|              0.47|        0.49|
## |SawyerW      |               12| 125|       0.04|             0.55|              0.51|        0.53|
## |NWAmes       |               12| 131|       0.04|             0.60|              0.55|        0.58|
## |Gilbert      |               12| 165|       0.06|             0.65|              0.60|        0.63|
## |CollgCr      |               12| 267|       0.09|             0.75|              0.65|        0.70|
## |Blmngtn      |               12|  28|       0.01|             0.76|              0.75|        0.75|
## |Crawfor      |               12| 103|       0.04|             0.79|              0.76|        0.77|
## |ClearCr      |               12|  44|       0.02|             0.81|              0.79|        0.80|
## |Somerst      |               12| 182|       0.06|             0.87|              0.81|        0.84|
## |Veenker      |               12|  24|       0.01|             0.88|              0.87|        0.87|
## |Timber       |               12|  72|       0.02|             0.90|              0.88|        0.89|
## |StoneBr      |               13|  51|       0.02|             0.92|              0.90|        0.91|
## |NridgHt      |               13| 166|       0.06|             0.98|              0.92|        0.95|
## |NoRidge      |               13|  71|       0.02|             1.00|              0.98|        0.99|
## [1] "---------------------------------------------------------------------"
## [1] "LotShape"
## 
## 
## |LotShape | BoxCox_SalePrice|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:--------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |Reg      |               12| 1859|       0.64|             0.64|              0.00|        0.32|
## |IR1      |               12|  968|       0.33|             0.97|              0.64|        0.80|
## |IR3      |               12|   16|       0.01|             0.97|              0.97|        0.97|
## |IR2      |               12|   76|       0.03|             1.00|              0.97|        0.99|
## [1] "---------------------------------------------------------------------"
## [1] "HouseStyle"
## 
## 
## |HouseStyle | BoxCox_SalePrice|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |1.5Unf     |               12|   19|       0.01|             0.01|              0.00|        0.00|
## |SFoyer     |               12|   83|       0.03|             0.03|              0.01|        0.02|
## |1.5Fin     |               12|  314|       0.11|             0.14|              0.03|        0.09|
## |2.5Unf     |               12|   24|       0.01|             0.15|              0.14|        0.15|
## |1Story     |               12| 1471|       0.50|             0.65|              0.15|        0.40|
## |SLvl       |               12|  128|       0.04|             0.70|              0.65|        0.68|
## |2Story     |               12|  872|       0.30|             1.00|              0.70|        0.85|
## |2.5Fin     |               12|    8|       0.00|             1.00|              1.00|        1.00|
## [1] "---------------------------------------------------------------------"
## [1] "Foundation"
## 
## 
## |Foundation | BoxCox_SalePrice|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |Slab       |               12|   49|       0.02|             0.02|              0.00|        0.01|
## |BrkTil     |               12|  311|       0.11|             0.12|              0.02|        0.07|
## |CBlock     |               12| 1235|       0.42|             0.55|              0.12|        0.33|
## |Stone      |               12|   11|       0.00|             0.55|              0.55|        0.55|
## |Wood       |               12|    5|       0.00|             0.55|              0.55|        0.55|
## |PConc      |               12| 1308|       0.45|             1.00|              0.55|        0.78|
## [1] "---------------------------------------------------------------------"
## [1] "MSSubClass"
## 
## 
## |MSSubClass | BoxCox_SalePrice|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |30         |               11|  139|       0.05|             0.05|              0.00|        0.02|
## |180        |               12|   17|       0.01|             0.05|              0.05|        0.05|
## |45         |               12|   18|       0.01|             0.06|              0.05|        0.06|
## |190        |               12|   61|       0.02|             0.08|              0.06|        0.07|
## |90         |               12|  109|       0.04|             0.12|              0.08|        0.10|
## |160        |               12|  128|       0.04|             0.16|              0.12|        0.14|
## |50         |               12|  287|       0.10|             0.26|              0.16|        0.21|
## |40         |               12|    6|       0.00|             0.26|              0.26|        0.26|
## |85         |               12|   48|       0.02|             0.28|              0.26|        0.27|
## |70         |               12|  128|       0.04|             0.32|              0.28|        0.30|
## |80         |               12|  118|       0.04|             0.36|              0.32|        0.34|
## |20         |               12| 1079|       0.37|             0.73|              0.36|        0.55|
## |75         |               12|   23|       0.01|             0.74|              0.73|        0.74|
## |120        |               12|  182|       0.06|             0.80|              0.74|        0.77|
## |60         |               12|  575|       0.20|             1.00|              0.80|        0.90|
## |150        |              NaN|    1|       0.00|             1.00|              1.00|        1.00|
## [1] "---------------------------------------------------------------------"
## [1] "OverallCond"
## 
## 
## |OverallCond | BoxCox_SalePrice|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |1           |               11|    7|       0.00|             0.00|              0.00|        0.00|
## |3           |               11|   50|       0.02|             0.02|              0.00|        0.01|
## |2           |               12|   10|       0.00|             0.02|              0.02|        0.02|
## |4           |               12|  101|       0.03|             0.06|              0.02|        0.04|
## |6           |               12|  531|       0.18|             0.24|              0.06|        0.15|
## |8           |               12|  144|       0.05|             0.29|              0.24|        0.26|
## |7           |               12|  390|       0.13|             0.42|              0.29|        0.36|
## |5           |               12| 1645|       0.56|             0.99|              0.42|        0.70|
## |9           |               12|   41|       0.01|             1.00|              0.99|        0.99|
## [1] "---------------------------------------------------------------------"
## [1] "Exterior1st"
## 
## 
## |Exterior1st | BoxCox_SalePrice|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |BrkComm     |               11|    6|       0.00|             0.00|              0.00|        0.00|
## |AsphShn     |               12|    2|       0.00|             0.00|              0.00|        0.00|
## |AsbShng     |               12|   44|       0.02|             0.02|              0.00|        0.01|
## |CBlock      |               12|    2|       0.00|             0.02|              0.02|        0.02|
## |Wd Sdng     |               12|  411|       0.14|             0.16|              0.02|        0.09|
## |WdShing     |               12|   56|       0.02|             0.18|              0.16|        0.17|
## |MetalSd     |               12|  450|       0.15|             0.33|              0.18|        0.26|
## |Stucco      |               12|   43|       0.01|             0.35|              0.33|        0.34|
## |HdBoard     |               12|  442|       0.15|             0.50|              0.35|        0.42|
## |Plywood     |               12|  221|       0.08|             0.57|              0.50|        0.54|
## |BrkFace     |               12|   87|       0.03|             0.60|              0.57|        0.59|
## |CemntBd     |               12|  126|       0.04|             0.65|              0.60|        0.63|
## |VinylSd     |               12| 1025|       0.35|             1.00|              0.65|        0.82|
## |Stone       |               12|    2|       0.00|             1.00|              1.00|        1.00|
## |ImStucc     |               12|    1|       0.00|             1.00|              1.00|        1.00|
## |N/A         |              NaN|    1|       0.00|             1.00|              1.00|        1.00|
## [1] "---------------------------------------------------------------------"
## [1] "Exterior2nd"
## 
## 
## |Exterior2nd | BoxCox_SalePrice|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |CBlock      |               12|    3|       0.00|             0.00|              0.00|        0.00|
## |AsbShng     |               12|   38|       0.01|             0.01|              0.00|        0.01|
## |Brk Cmn     |               12|   22|       0.01|             0.02|              0.01|        0.02|
## |AsphShn     |               12|    4|       0.00|             0.02|              0.02|        0.02|
## |Wd Sdng     |               12|  391|       0.13|             0.16|              0.02|        0.09|
## |Stucco      |               12|   47|       0.02|             0.17|              0.16|        0.16|
## |MetalSd     |               12|  447|       0.15|             0.33|              0.17|        0.25|
## |Wd Shng     |               12|   81|       0.03|             0.35|              0.33|        0.34|
## |Stone       |               12|    6|       0.00|             0.36|              0.35|        0.35|
## |HdBoard     |               12|  406|       0.14|             0.50|              0.36|        0.43|
## |Plywood     |               12|  270|       0.09|             0.59|              0.50|        0.54|
## |BrkFace     |               12|   47|       0.02|             0.60|              0.59|        0.60|
## |CmentBd     |               12|  126|       0.04|             0.65|              0.60|        0.63|
## |VinylSd     |               12| 1014|       0.35|             0.99|              0.65|        0.82|
## |ImStucc     |               12|   15|       0.01|             1.00|              0.99|        1.00|
## |Other       |               13|    1|       0.00|             1.00|              1.00|        1.00|
## |N/A         |              NaN|    1|       0.00|             1.00|              1.00|        1.00|

Check results

# Summarise every column of M to check the prepared data.
summary(M)
##       Id              MSSubClass      MSZoning     lot_frontage    lot_area  
##  Length:2919        Min.   :0.00   Min.   :0.00   Min.   : 21   Min.   : 46  
##  Class :character   1st Qu.:0.00   1st Qu.:1.00   1st Qu.: 60   1st Qu.: 86  
##  Mode  :character   Median :1.00   Median :1.00   Median : 70   Median : 95  
##                     Mean   :0.64   Mean   :0.78   Mean   : 70   Mean   : 94  
##                     3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.: 80   3rd Qu.:103  
##                     Max.   :1.00   Max.   :1.00   Max.   :136   Max.   :158  
##                                                                              
##      Street      Alley         LotShape     LandContour    Utilities
##  Min.   :0   Min.   :0.00   Min.   :0.00   Min.   :0.0   Min.   :0  
##  1st Qu.:1   1st Qu.:1.00   1st Qu.:0.00   1st Qu.:1.0   1st Qu.:1  
##  Median :1   Median :1.00   Median :0.00   Median :1.0   Median :1  
##  Mean   :1   Mean   :0.93   Mean   :0.36   Mean   :0.9   Mean   :1  
##  3rd Qu.:1   3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:1.0   3rd Qu.:1  
##  Max.   :1   Max.   :1.00   Max.   :1.00   Max.   :1.0   Max.   :1  
##                                                                     
##    LotConfig      LandSlope     Neighborhood    Condition1     Condition2  
##  Min.   :0.00   Min.   :0.00   Min.   :0.00   Min.   :0.00   Min.   :0.00  
##  1st Qu.:0.00   1st Qu.:1.00   1st Qu.:0.00   1st Qu.:1.00   1st Qu.:1.00  
##  Median :1.00   Median :1.00   Median :0.00   Median :1.00   Median :1.00  
##  Mean   :0.73   Mean   :0.95   Mean   :0.49   Mean   :0.86   Mean   :0.99  
##  3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:1.00  
##  Max.   :1.00   Max.   :1.00   Max.   :1.00   Max.   :1.00   Max.   :1.00  
##                                                                            
##     BldgType      HouseStyle    OverallQual    OverallCond     year_built     
##  Min.   :0.00   Min.   :0.00   Min.   :0.00   Min.   :0.00   Min.   :1805000  
##  1st Qu.:1.00   1st Qu.:0.00   1st Qu.:0.00   1st Qu.:0.00   1st Qu.:1908081  
##  Median :1.00   Median :0.00   Median :1.00   Median :1.00   Median :1946364  
##  Mean   :0.83   Mean   :0.35   Mean   :0.62   Mean   :0.58   Mean   :1943692  
##  3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:2002000  
##  Max.   :1.00   Max.   :1.00   Max.   :1.00   Max.   :1.00   Max.   :2016032  
##                                                                               
##  year_remod_add      RoofStyle       RoofMatl     Exterior1st   Exterior2nd 
##  Min.   :1901250   Min.   :0.00   Min.   :0.00   Min.   :0.0   Min.   :0.0  
##  1st Qu.:1930612   1st Qu.:1.00   1st Qu.:1.00   1st Qu.:0.0   1st Qu.:0.0  
##  Median :1986024   Median :1.00   Median :1.00   Median :1.0   Median :1.0  
##  Mean   :1968862   Mean   :0.79   Mean   :0.99   Mean   :0.5   Mean   :0.5  
##  3rd Qu.:2008008   3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:1.0   3rd Qu.:1.0  
##  Max.   :2018040   Max.   :1.00   Max.   :1.00   Max.   :1.0   Max.   :1.0  
##                                                                             
##    MasVnrType    mas_vnr_area   ExterQual      ExterCond      Foundation  
##  Min.   :0.00   Min.   :0.0   Min.   :0.00   Min.   :0.00   Min.   :0.00  
##  1st Qu.:0.00   1st Qu.:0.0   1st Qu.:0.00   1st Qu.:1.00   1st Qu.:0.00  
##  Median :0.00   Median :0.0   Median :0.00   Median :1.00   Median :0.00  
##  Mean   :0.39   Mean   :1.1   Mean   :0.37   Mean   :0.87   Mean   :0.45  
##  3rd Qu.:1.00   3rd Qu.:2.9   3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:1.00  
##  Max.   :1.00   Max.   :3.2   Max.   :1.00   Max.   :1.00   Max.   :1.00  
##                                                                           
##     BsmtQual      BsmtCond     BsmtExposure   BsmtFinType1   bsmt_fin_sf1 
##  Min.   :0.0   Min.   :0.00   Min.   :0.00   Min.   :0.00   Min.   : 0.0  
##  1st Qu.:0.0   1st Qu.:1.00   1st Qu.:0.00   1st Qu.:0.00   1st Qu.: 0.0  
##  Median :1.0   Median :1.00   Median :1.00   Median :1.00   Median :12.5  
##  Mean   :0.5   Mean   :0.89   Mean   :0.65   Mean   :0.58   Mean   : 9.5  
##  3rd Qu.:1.0   3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:15.4  
##  Max.   :1.0   Max.   :1.00   Max.   :1.00   Max.   :1.00   Max.   :19.4  
##                                                                           
##   BsmtFinType2   bsmt_fin_sf2   bsmt_unf_sf total_bsmt_sf    Heating    
##  Min.   :0.00   Min.   :0.00   Min.   : 0   Min.   :  0   Min.   :0.00  
##  1st Qu.:1.00   1st Qu.:0.00   1st Qu.:25   1st Qu.:304   1st Qu.:1.00  
##  Median :1.00   Median :0.00   Median :36   Median :365   Median :1.00  
##  Mean   :0.85   Mean   :0.08   Mean   :36   Mean   :378   Mean   :0.98  
##  3rd Qu.:1.00   3rd Qu.:0.00   3rd Qu.:48   3rd Qu.:459   3rd Qu.:1.00  
##  Max.   :1.00   Max.   :0.68   Max.   :70   Max.   :709   Max.   :1.00  
##                                                                         
##    HeatingQC      CentralAir     Electrical    x1st_flr_sf   x2nd_flr_sf 
##  Min.   :0.00   Min.   :0.00   Min.   :0.00   Min.   :6.3   Min.   :0.0  
##  1st Qu.:0.00   1st Qu.:1.00   1st Qu.:1.00   1st Qu.:6.8   1st Qu.:0.0  
##  Median :1.00   Median :1.00   Median :1.00   Median :7.0   Median :0.0  
##  Mean   :0.51   Mean   :0.93   Mean   :0.92   Mean   :7.0   Mean   :1.9  
##  3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:7.2   3rd Qu.:4.4  
##  Max.   :1.00   Max.   :1.00   Max.   :1.00   Max.   :7.7   Max.   :4.6  
##                                                                          
##  low_qual_fin_sf  gr_liv_area  bsmt_full_bath bsmt_half_bath   full_bath   
##  Min.   :  0     Min.   :6.5   Min.   :0.00   Min.   :0.00   Min.   :1.00  
##  1st Qu.:  0     1st Qu.:7.0   1st Qu.:0.00   1st Qu.:0.00   1st Qu.:1.00  
##  Median :  0     Median :7.3   Median :0.00   Median :0.00   Median :2.00  
##  Mean   :  2     Mean   :7.3   Mean   :0.43   Mean   :0.06   Mean   :1.57  
##  3rd Qu.:  0     3rd Qu.:7.5   3rd Qu.:1.00   3rd Qu.:0.00   3rd Qu.:2.00  
##  Max.   :154     Max.   :8.0   Max.   :2.00   Max.   :1.00   Max.   :3.00  
##                                                                            
##    half_bath    bedroom_abv_gr kitchen_abv_gr  KitchenQual   tot_rms_abv_grd
##  Min.   :0.00   Min.   :1.0    Min.   :1.00   Min.   :0.00   Min.   :1.39   
##  1st Qu.:0.00   1st Qu.:2.0    1st Qu.:1.00   1st Qu.:0.00   1st Qu.:1.61   
##  Median :0.00   Median :3.0    Median :1.00   Median :0.00   Median :1.79   
##  Mean   :0.37   Mean   :2.9    Mean   :1.04   Mean   :0.46   Mean   :1.84   
##  3rd Qu.:1.00   3rd Qu.:3.0    3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:1.95   
##  Max.   :1.00   Max.   :5.0    Max.   :2.00   Max.   :1.00   Max.   :2.40   
##                                                                             
##    Functional     fireplaces    FireplaceQu    GarageType   garage_yr_blt    
##  Min.   :0.00   Min.   :0.00   Min.   :0.0   Min.   :0.00   Min.   :1833612  
##  1st Qu.:1.00   1st Qu.:0.00   1st Qu.:0.0   1st Qu.:0.00   1st Qu.:1916882  
##  Median :1.00   Median :1.00   Median :0.0   Median :1.00   Median :1954264  
##  Mean   :0.93   Mean   :0.59   Mean   :0.5   Mean   :0.65   Mean   :1953346  
##  3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:1.0   3rd Qu.:1.00   3rd Qu.:2002000  
##  Max.   :1.00   Max.   :2.00   Max.   :1.0   Max.   :1.00   Max.   :2018040  
##                                                                              
##   GarageFinish   garage_cars    garage_area     GarageQual     GarageCond  
##  Min.   :0.00   Min.   :0.00   Min.   :   0   Min.   :0.00   Min.   :0.00  
##  1st Qu.:0.00   1st Qu.:1.00   1st Qu.: 320   1st Qu.:1.00   1st Qu.:1.00  
##  Median :1.00   Median :2.00   Median : 480   Median :1.00   Median :1.00  
##  Mean   :0.52   Mean   :1.76   Mean   : 471   Mean   :0.89   Mean   :0.91  
##  3rd Qu.:1.00   3rd Qu.:2.00   3rd Qu.: 576   3rd Qu.:1.00   3rd Qu.:1.00  
##  Max.   :1.00   Max.   :3.00   Max.   :1019   Max.   :1.00   Max.   :1.00  
##                                                                            
##    PavedDrive   wood_deck_sf open_porch_sf enclosed_porch  x3ssn_porch 
##  Min.   :0.0   Min.   :0.0   Min.   :0.0   Min.   :0.00   Min.   :  0  
##  1st Qu.:1.0   1st Qu.:0.0   1st Qu.:0.0   1st Qu.:0.00   1st Qu.:  0  
##  Median :1.0   Median :0.0   Median :3.4   Median :0.00   Median :  0  
##  Mean   :0.9   Mean   :2.0   Mean   :2.5   Mean   :0.12   Mean   :  2  
##  3rd Qu.:1.0   3rd Qu.:4.2   3rd Qu.:4.5   3rd Qu.:0.00   3rd Qu.:  0  
##  Max.   :1.0   Max.   :4.9   Max.   :6.1   Max.   :0.77   Max.   :144  
##                                                                        
##   screen_porch     PoolQC      Fence      MiscFeature      misc_val  
##  Min.   :  0   Min.   :0   Min.   :0.0   Min.   :0.00   Min.   :  0  
##  1st Qu.:  0   1st Qu.:1   1st Qu.:1.0   1st Qu.:1.00   1st Qu.:  0  
##  Median :  0   Median :1   Median :1.0   Median :1.00   Median :  0  
##  Mean   : 15   Mean   :1   Mean   :0.8   Mean   :0.96   Mean   : 23  
##  3rd Qu.:  0   3rd Qu.:1   3rd Qu.:1.0   3rd Qu.:1.00   3rd Qu.:  0  
##  Max.   :260   Max.   :1   Max.   :1.0   Max.   :1.00   Max.   :982  
##                                                                      
##     mo_sold        yr_sold        SaleType    SaleCondition    SalePrice     
##  Min.   : 1.0   Min.   :2006   Min.   :0.00   Min.   :0.00   Min.   : 34900  
##  1st Qu.: 4.0   1st Qu.:2007   1st Qu.:1.00   1st Qu.:1.00   1st Qu.:129975  
##  Median : 6.0   Median :2008   Median :1.00   Median :1.00   Median :163000  
##  Mean   : 6.2   Mean   :2008   Mean   :0.87   Mean   :0.82   Mean   :180921  
##  3rd Qu.: 8.0   3rd Qu.:2009   3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:214000  
##  Max.   :12.0   Max.   :2010   Max.   :1.00   Max.   :1.00   Max.   :755000  
##                                                              NA's   :1459    
##   TrainTest                         Bin       BoxCox_SalePrice
##  Length:2919        [3.49e+04,1.4e+05]: 487   Min.   :10      
##  Class :character   (1.4e+05,1.9e+05] : 490   1st Qu.:12      
##  Mode  :character   (1.9e+05,7.55e+05]: 483   Median :12      
##                     NA's              :1459   Mean   :12      
##                                               3rd Qu.:12      
##                                               Max.   :14      
##                                               NA's   :1459

Save prepared data

# Write `M` out as a CSV file. `row.names = FALSE` keeps R's row names from
# being emitted as an extra leading column.
utils::write.csv(
  M,
  row.names = FALSE,
  file = "/Users/thienpham/Data Mining/data/prepared_house_prices.csv"
)

Homework

Take your data for supervised regression learning and prepare your predictors and your target variable. You do not need to perform feature selection yet; we will do that in an upcoming assignment.